Data Cleaning
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.4 v stringr 1.4.0
## v tidyr 1.1.2 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(ROCR)
survey <- read_csv("survey.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_character(),
## Age = col_double()
## )
## i Use `spec()` for the full column specifications.
summary(survey$Gender)
## Length Class Mode
## 1259 character character
unique(survey$Gender)
## [1] "Female"
## [2] "Male"
## [3] "M"
## [4] "F"
## [5] "male"
## [6] "m"
## [7] "A little about you"
## [8] "female"
## [9] "Guy (-ish) ^_^"
## [10] "male leaning androgynous"
## [11] "Man"
## [12] "fluid"
## [13] "queer"
## [14] "f"
## [15] "Mal"
## [16] "Malr"
## [17] "something kinda male?"
## [18] "Woman"
## [19] "Enby"
## [20] "Androgyne"
## [21] "Agender"
## [22] "Neuter"
## [23] "Cis Man"
## [24] "ostensibly male, unsure what that really means"
## [25] "Male-ish"
## [26] "maile"
## [27] "Trans-female"
## [28] "Cis Female"
## [29] "Cis Male"
## [30] "Male (CIS)"
## [31] "queer/she/they"
## [32] "non-binary"
## [33] "Femake"
## [34] "woman"
## [35] "Make"
## [36] "Nah"
## [37] "Genderqueer"
## [38] "cis-female/femme"
## [39] "Trans woman"
## [40] "msle"
## [41] "Female (trans)"
## [42] "Female (cis)"
## [43] "Mail"
## [44] "cis male"
## [45] "p"
## [46] "femail"
## [47] "All"
survey$Gender[survey$Gender=="Female"|survey$Gender=="Woman"|survey$Gender=="female"|survey$Gender=="f"|survey$Gender=="Femake"|survey$Gender=="woman"|survey$Gender=="Cis Female"|survey$Gender=="Female (cis)"|survey$Gender=="femail"|survey$Gender=="cis-female/femme"|survey$Gender=="Trans woman"|survey$Gender=="Female (trans)"|survey$Gender=="Trans-Female"|survey$Gender=="Trans-female"]<-"F"
survey$Gender[survey$Gender=="m"|survey$Gender=="Guy (-ish) ^_^"|survey$Gender=="Man"|survey$Gender=="Malr"|survey$Gender=="Cis Man"|survey$Gender=="Male-ish"|survey$Gender=="Cis Male"|survey$Gender=="Make"|survey$Gender=="Mail"|survey$Gender=="Male"|survey$Gender=="male"|survey$Gender=="male leaning androgynous"|survey$Gender=="Mal"|survey$Gender=="something kinda male?"|survey$Gender=="ostensibly male, unsure what that really means"|survey$Gender=="maile"|survey$Gender=="Male (CIS)"|survey$Gender=="msle"|survey$Gender=="cis male"]<-"M"
survey$Gender[survey$Gender=="queer"|survey$Gender=="Agender"|survey$Gender=="Nah"|survey$Gender=="Androgyne"|survey$Gender=="fluid"|survey$Gender=="Neuter"|survey$Gender=="queer/she/they"|survey$Gender=="non-binary"|survey$Gender=="Genderqueer"|survey$Gender=="p"|survey$Gender=="All"|survey$Gender=="Agender"|survey$Gender=="Enby"|survey$Gender=="A little about you"]<-"X"
unique(survey$Gender)
## [1] "F" "M" "X"
no_x<-survey[survey$Gender!="X",]
no_x$no_employees<-NULL
no_x$Timestamp<-NULL
survey
## # A tibble: 1,259 x 27
## Timestamp Age Gender Country state self_employed family_history treatment
## <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 8/27/201~ 23 F Austra~ <NA> No Yes Yes
## 2 8/27/201~ 34 M Austra~ <NA> Yes No No
## 3 8/27/201~ 25 M Austra~ <NA> No Yes Yes
## 4 8/27/201~ 22 M Austra~ <NA> Yes Yes Yes
## 5 8/27/201~ 20 M Austra~ <NA> No No No
## 6 8/27/201~ 31 M Austra~ <NA> No No Yes
## 7 8/27/201~ 27 F Austra~ <NA> No Yes Yes
## 8 8/27/201~ 34 M Austra~ <NA> No No No
## 9 8/27/201~ 33 M Austra~ <NA> No No No
## 10 8/27/201~ 27 M Austra~ <NA> No No Yes
## # ... with 1,249 more rows, and 19 more variables: work_interfere <chr>,
## # no_employees <chr>, remote_work <chr>, tech_company <chr>, benefits <chr>,
## # care_options <chr>, wellness_program <chr>, seek_help <chr>,
## # anonymity <chr>, leave <chr>, mental_health_consequence <chr>,
## # phys_health_consequence <chr>, coworkers <chr>, supervisor <chr>,
## # mental_health_interview <chr>, phys_health_interview <chr>,
## # mental_vs_physical <chr>, obs_consequence <chr>, comments <chr>
no_x$Gender[no_x$Gender=="M"]<-1
no_x$Gender[no_x$Gender=="F"]<-0
no_x$treatment[no_x$treatment=="Yes"]<-1
no_x$treatment[no_x$treatment=="No"]<-0
no_x$family_history[no_x$family_history=="Yes"]<-1
no_x$family_history[no_x$family_history=="No"]<-0
no_x$self_employed[no_x$self_employed=="Yes"]<-1
no_x$self_employed[no_x$self_employed=="No"]<-0
no_x$remote_work[no_x$remote_work=="Yes"]<-1
no_x$remote_work[no_x$remote_work=="No"]<-0
no_x$tech_company[no_x$tech_company=="Yes"]<-1
no_x$tech_company[no_x$tech_company=="No"]<-0
no_x$benefits[no_x$benefits=="Yes"]<-1
no_x$benefits[no_x$benefits=="No"]<-0
no_x$benefits[no_x$benefits=="Don't know"]<-2
no_x$care_options[no_x$care_options=="Yes"]<-1
no_x$care_options[no_x$care_options=="No"]<-0
no_x$care_options[no_x$care_options=="Not sure"]<-2
no_x$wellness_program[no_x$wellness_program=="Yes"]<-1
no_x$wellness_program[no_x$wellness_program=="No"]<-0
no_x$wellness_program[no_x$wellness_program=="Don't know"]<-2
no_x$seek_help[no_x$seek_help=="Yes"]<-1
no_x$seek_help[no_x$seek_help=="No"]<-0
no_x$seek_help[no_x$seek_help=="Don't know"]<-2
no_x$anonymity[no_x$anonymity=="Yes"]<-1
no_x$anonymity[no_x$anonymity=="No"]<-0
no_x$anonymity[no_x$anonymity=="Don't know"]<-2
no_x$mental_health_consequence[no_x$mental_health_consequence=="Yes"]<-1
no_x$mental_health_consequence[no_x$mental_health_consequence=="No"]<-0
no_x$mental_health_consequence[no_x$mental_health_consequence=="Maybe"]<-2
no_x$phys_health_consequence[no_x$phys_health_consequence=="Yes"]<-1
no_x$phys_health_consequence[no_x$phys_health_consequence=="No"]<-0
no_x$phys_health_consequence[no_x$phys_health_consequence=="Maybe"]<-2
no_x$supervisor[no_x$supervisor=="Yes"]<-1
no_x$supervisor[no_x$supervisor=="No"]<-0
no_x$supervisor[no_x$supervisor=="Some of them"]<-2
no_x$coworkers[no_x$coworkers=="Yes"]<-1
no_x$coworkers[no_x$coworkers=="No"]<-0
no_x$coworkers[no_x$coworkers=="Some of them"]<-2
no_x$mental_health_interview[no_x$mental_health_interview=="Yes"]<-1
no_x$mental_health_interview[no_x$mental_health_interview=="No"]<-0
no_x$mental_health_interview[no_x$mental_health_interview=="Maybe"]<-2
no_x$phys_health_interview[no_x$phys_health_interview=="Yes"]<-1
no_x$phys_health_interview[no_x$phys_health_interview=="No"]<-0
no_x$phys_health_interview[no_x$phys_health_interview=="Maybe"]<-2
no_x$mental_vs_physical[no_x$mental_vs_physical=="Yes"]<-1
no_x$mental_vs_physical[no_x$mental_vs_physical=="No"]<-0
no_x$mental_vs_physical[no_x$mental_vs_physical=="Don't know"]<-2
no_x$obs_consequence[no_x$obs_consequence=="Yes"]<-1
no_x$obs_consequence[no_x$obs_consequence=="No"]<-0
no_x$obs_consequence[no_x$obs_consequence=="Maybe"]<-2
EDA
sum(no_x$treatment==1)/1246
## [1] 0.5016051
sum(no_x$Gender==1)/1246
## [1] 0.7985554
sum(no_x$remote_work==1)/1246
## [1] 0.2985554
sum(no_x$Gender==1)/1246
## [1] 0.7985554
sum(is.na(survey$comments))
## [1] 1096
no_x$work_interfere<-NULL
no_x$leave<-NULL
uk_us<-filter(no_x,no_x$Country=="United Kingdom"|no_x$Country=="United States")
us<-filter(no_x,no_x$Country=="United States")
uk<-filter(no_x,no_x$Country=="United Kingdom")
sum(uk$treatment==1)/181
## [1] 0.4917127
sum(us$treatment==1)/746
## [1] 0.5442359
sum(uk$family_history==1)/181
## [1] 0.3093923
sum(us$family_history==1)/746
## [1] 0.4356568
sum(uk$remote_work==1)/181
## [1] 0.2265193
sum(us$remote_work==1)/746
## [1] 0.3163539
sum(uk$care_options==1)/181
## [1] 0.2099448
sum(us$care_options==1)/746
## [1] 0.4115282
sum(uk$benefits==1)/181
## [1] 0.1104972
sum(us$benefits==1)/746
## [1] 0.5268097
sum(uk$seek_help==1)/181
## [1] 0.1381215
sum(us$seek_help==1)/746
## [1] 0.2520107
sum(uk$wellness_program==1)/181
## [1] 0.08839779
sum(us$wellness_program==1)/746
## [1] 0.2211796
sum(uk$obs_consequence==1)/181
## [1] 0.1933702
sum(us$obs_consequence==1)/746
## [1] 0.1179625
mean(uk$treatment)
## Warning in mean.default(uk$treatment): argument is not numeric or logical:
## returning NA
## [1] NA
mean(uk$family_history)
## Warning in mean.default(uk$family_history): argument is not numeric or logical:
## returning NA
## [1] NA
ggplot(us)+geom_bar(aes(x=factor(benefits),fill=factor(benefits)))+xlab("Survey Response")+ylab("Number of Respondents")+labs(title="United States' responses to if their workplace offers mental health benefits",fill="Survey Response")+scale_fill_manual(values=
c('red','green','yellow'),labels=
c("No","Yes","Don't Know"))

ggplot(uk)+geom_bar(aes(x=factor(benefits),fill=factor(benefits)))+xlab("Survey Response")+ylab("Number of Respondents")+labs(title="United Kingdoms' responses to if their workplace offers mental health benefits",fill="Survey Response")+scale_fill_manual(values=
c('red','green','yellow'),labels=
c("No","Yes","Don't Know"))

ggplot(us)+geom_bar(aes(x=factor(care_options),fill=factor(care_options)))+xlab("Survey Response")+ylab("Number of Respondents")+labs(title="United States' responses to if they know about the care options provided by their employer",fill="Survey Response")+scale_fill_manual(values=
c('red','green','yellow'),labels=
c("No","Yes","Not sure"))

ggplot(uk)+geom_bar(aes(x=factor(care_options),fill=factor(care_options)))+xlab("Survey Response")+ylab("Number of Respondents")+labs(title="United Kingdoms' responses to if they know about the care options provided by their employer",fill="Survey Response")+scale_fill_manual(values=
c('red','green','yellow'),labels=
c("No","Yes","Not sure"))

ustreatment<-us[us$treatment==1,]
uktreatment<-uk[uk$treatment==1,]
ggplot(ustreatment) +geom_bar(aes(x = factor(Gender),fill=factor(Gender))) +ggtitle("Treatment Sought by Gender in the United States")+labs(fill="Gender")+xlab("Gender")+ylab("Number of Respondents")+scale_fill_manual(values=c('pink','blue'),labels=c("Female","Male"))

ggplot(uktreatment) +geom_bar(aes(x = factor(Gender),fill=factor(Gender))) +ggtitle("Treatment Sought by Gender in the United Kingdom")+labs(fill="Gender")+xlab("Gender")+ylab("Number of Respondents")+scale_fill_manual(values=c('pink','blue'),labels=c("Female","Male"))

summary(uk_us)
## Age Gender Country state
## Min. :-1726.00 Length:927 Length:927 Length:927
## 1st Qu.: 27.00 Class :character Class :character Class :character
## Median : 32.00 Mode :character Mode :character Mode :character
## Mean : 31.03
## 3rd Qu.: 37.00
## Max. : 329.00
## self_employed family_history treatment remote_work
## Length:927 Length:927 Length:927 Length:927
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## tech_company benefits care_options wellness_program
## Length:927 Length:927 Length:927 Length:927
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## seek_help anonymity mental_health_consequence
## Length:927 Length:927 Length:927
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## phys_health_consequence coworkers supervisor
## Length:927 Length:927 Length:927
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## mental_health_interview phys_health_interview mental_vs_physical
## Length:927 Length:927 Length:927
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## obs_consequence comments
## Length:927 Length:927
## Class :character Class :character
## Mode :character Mode :character
##
##
##
newdf<-as.data.frame(unclass(uk_us))
summary(uk_us)
## Age Gender Country state
## Min. :-1726.00 Length:927 Length:927 Length:927
## 1st Qu.: 27.00 Class :character Class :character Class :character
## Median : 32.00 Mode :character Mode :character Mode :character
## Mean : 31.03
## 3rd Qu.: 37.00
## Max. : 329.00
## self_employed family_history treatment remote_work
## Length:927 Length:927 Length:927 Length:927
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## tech_company benefits care_options wellness_program
## Length:927 Length:927 Length:927 Length:927
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## seek_help anonymity mental_health_consequence
## Length:927 Length:927 Length:927
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## phys_health_consequence coworkers supervisor
## Length:927 Length:927 Length:927
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## mental_health_interview phys_health_interview mental_vs_physical
## Length:927 Length:927 Length:927
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## obs_consequence comments
## Length:927 Length:927
## Class :character Class :character
## Mode :character Mode :character
##
##
##
str(uk_us)
## tibble [927 x 23] (S3: tbl_df/tbl/data.frame)
## $ Age : num [1:927] 31 23 37 32 30 24 28 27 38 19 ...
## $ Gender : chr [1:927] "1" "1" "1" "1" ...
## $ Country : chr [1:927] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
## $ state : chr [1:927] NA NA NA NA ...
## $ self_employed : chr [1:927] NA NA "0" "0" ...
## $ family_history : chr [1:927] "1" "0" "0" "0" ...
## $ treatment : chr [1:927] "1" "1" "0" "0" ...
## $ remote_work : chr [1:927] "0" "1" "0" "1" ...
## $ tech_company : chr [1:927] "1" "1" "1" "1" ...
## $ benefits : chr [1:927] "0" "2" "0" "0" ...
## $ care_options : chr [1:927] "1" "0" "0" "0" ...
## $ wellness_program : chr [1:927] "0" "2" "0" "0" ...
## $ seek_help : chr [1:927] "0" "2" "0" "0" ...
## $ anonymity : chr [1:927] "0" "2" "2" "2" ...
## $ mental_health_consequence: chr [1:927] "1" "2" "1" "1" ...
## $ phys_health_consequence : chr [1:927] "1" "0" "2" "1" ...
## $ coworkers : chr [1:927] "2" "2" "2" "2" ...
## $ supervisor : chr [1:927] "0" "0" "0" "2" ...
## $ mental_health_interview : chr [1:927] "2" "2" "0" "0" ...
## $ phys_health_interview : chr [1:927] "2" "2" "2" "2" ...
## $ mental_vs_physical : chr [1:927] "0" "0" "0" "0" ...
## $ obs_consequence : chr [1:927] "1" "0" "0" "0" ...
## $ comments : chr [1:927] NA "My company does provide healthcare but not to me as I'm on a fixed-term contract. The mental healthcare I use i"| __truncated__ NA NA ...
f <- sapply(newdf, is.factor)
f
## Age Gender Country
## FALSE FALSE FALSE
## state self_employed family_history
## FALSE FALSE FALSE
## treatment remote_work tech_company
## FALSE FALSE FALSE
## benefits care_options wellness_program
## FALSE FALSE FALSE
## seek_help anonymity mental_health_consequence
## FALSE FALSE FALSE
## phys_health_consequence coworkers supervisor
## FALSE FALSE FALSE
## mental_health_interview phys_health_interview mental_vs_physical
## FALSE FALSE FALSE
## obs_consequence comments
## FALSE FALSE
survey_data <- read.csv('survey2.csv')
survey_data[,10] <- NULL
survey_data <- survey_data[survey_data$Country == "United States" | survey_data$Country == "United Kingdom", ]
length(unique(survey_data$state))
## [1] 46
states_and_count <- aggregate(data.frame(count = survey_data$state), list(value = survey_data$state), length)
usstates_and_loc <- read.csv('usstates_count.csv')
states <-
geojson_read(
x = "https://raw.githubusercontent.com/PublicaMundi/MappingAPI/master/data/geojson/us-states.json"
, what = "sp"
)
states$count<-usstates_and_loc$count
m <- leaflet(states) %>%
setView(-96, 37.8, 4) %>%
addTiles()
m %>% addPolygons()
bins <- c(0, 1, 10, 20, 50, 100, 200)
pal <- colorBin("YlOrRd", domain = states$count, bins = bins)
labels <- sprintf(
"<strong>%s</strong><br/>%g survey respondents",
states$name, states$count
) %>% lapply(htmltools::HTML)
m <- m %>% addPolygons(
fillColor = ~pal(count),
weight = 2,
opacity = 1,
color = "white",
dashArray = "3",
fillOpacity = 0.7,
highlight = highlightOptions(
weight = 5,
color = "#666",
dashArray = "",
fillOpacity = 0.7,
bringToFront = TRUE),
label = labels,
labelOptions = labelOptions(
style = list("font-weight" = "normal", padding = "3px 8px"),
textsize = "15px",
direction = "auto")
)
m %>% addLegend(pal = pal, values = ~count, opacity = 0.7, title = NULL,
position = "bottomright")
m
Model Building
library(knitr)
library(plotly)
survey_data <- read.csv('survey2.csv')
survey_data[,10] <- NULL
survey_data <- survey_data[survey_data$Country == "United States" | survey_data$Country == "United Kingdom", ]
survey_data$no_employees<-NULL
survey_data$Timestamp<-NULL
survey_data$comments <- NULL
survey_data$state <- NULL
survey_data$X <- NULL
survey_data00 <- na.omit(survey_data)
survey_data000<-survey_data00[which(survey_data00$Gender!="X"),]
survey_data2 <- survey_data000[1:914,]
survey_data2<-survey_data2[-c(which(survey_data2$Age<18)),]
survey_data2<-survey_data2[-c(which(survey_data2$Age>100)),]
survey_data2 <- survey_data2 %>% mutate(Age = case_when(Age>=70&Age<=79~'6',
Age>=60&Age<=69~'5',
Age>=50&Age<=59~'4',
Age >= 40 & Age <= 49 ~ '3',
Age >= 30 & Age <= 39 ~ '2',
Age >= 20 & Age <= 29 ~ '1',
Age>=10&Age<=19~'0'))
survey_data2$leave[survey_data2$leave == "Very easy"] <- "Easy"
survey_data2$leave[survey_data2$leave == "Somewhat easy"] <- "Easy"
survey_data2$leave[survey_data2$leave == "Very difficult"] <- "Difficult"
survey_data2$leave[survey_data2$leave == "Somewhat difficult"] <- "Difficult"
factorized <- lapply(survey_data2[,2:22], factor)
double1 <- as.factor(survey_data2$Age)
library(dplyr)
factorized$Age <- double1
factorized <- as.data.frame(factorized)
UK_rf_input <- subset(factorized, Country == "United Kingdom")
UK_rf_input$Country <- NULL
US_rf_input <- subset(factorized, Country == "United States")
US_rf_input$Country <- NULL
UK_rf_input<-UK_rf_input[sample(nrow(UK_rf_input),1000,replace=TRUE),]
US_rf_input<-US_rf_input[sample(nrow(US_rf_input),1000,replace=TRUE),]
sum(UK_rf_input$treatment==1)/1000
## [1] 0
sum(US_rf_input$treatment==1)/1000
## [1] 0
UK_rf_label<-UK_rf_input[,4]
UK_rf_features<-UK_rf_input[,-4]
set.seed(1950)
US_rf_label<-US_rf_input[,4]
US_rf_features<-US_rf_input[,-4]
UK_rf_input$treatment <- as.factor(ifelse(UK_rf_input$treatment=="No", "0", "1"))
US_rf_input$treatment <- as.factor(ifelse(US_rf_input$treatment=="No", "0", "1"))
sample_rows = 1:nrow(UK_rf_input)
set.seed(1984)
test_rows = sample(sample_rows,
dim(UK_rf_input)[1]*.10,
replace = FALSE)
UK_train = UK_rf_input[-test_rows,]
UK_test = UK_rf_input[test_rows,]
set.seed(1950)
UK_initial_RF = randomForest(treatment~.,
UK_train,
ntree = 500,
mtry = 12,
replace = TRUE,
sampsize = 100,
nodesize = 5,
importance = TRUE,
proximity = FALSE,
norm.votes = TRUE,
do.trace = FALSE,
keep.forest = TRUE,
keep.inbag = TRUE)
kable(UK_initial_RF$confusion)
| 0 |
416 |
31 |
0.0693512 |
| 1 |
71 |
382 |
0.1567329 |
UK_RF_acc = sum(UK_initial_RF$confusion[row(UK_initial_RF$confusion) == col(UK_initial_RF$confusion)]) / sum(UK_initial_RF$confusion)
UK_RF_acc
## [1] 0.886444
ukpred<-data.frame(UK_initial_RF$predicted,UK_train)
ukpredlist<-as.list(UK_initial_RF$predicted)
labellist<-as.list(UK_train[,4])
UK_RF_2_prediction = as.data.frame(as.numeric(as.character(UK_initial_RF$votes[,2])))
UK_train_actual = data.frame(UK_train[,4])
UK_prediction_comparison = prediction(UK_RF_2_prediction, UK_train_actual)
UK_pred_performance = performance(UK_prediction_comparison,
measure = "tpr",
x.measure = "fpr")
UK_rates = data.frame(fp = UK_prediction_comparison@fp,
tp = UK_prediction_comparison@tp,
tn = UK_prediction_comparison@tn,
fn = UK_prediction_comparison@fn)
colnames(UK_rates) = c("fp", "tp", "tn", "fn")
tpr = UK_rates$tp / (UK_rates$tp + UK_rates$fn)
fpr = UK_rates$fp / (UK_rates$fp + UK_rates$tn)
UK_rates_comparison = data.frame(UK_pred_performance@x.values,
UK_pred_performance@y.values,
fpr,
tpr)
colnames(UK_rates_comparison) = c("x.values","y.values","fpr","tpr")
while (!is.null(dev.list())) dev.off()
plot(UK_pred_performance,
col = "orange",
lwd = 3,
main = "ROC curve")
grid(col = "black")
abline(a = 0,
b = 1,
lwd = 2,
lty = 2,
col = "blue")
UK_auc_RF = performance(UK_prediction_comparison,
"auc")@y.values[[1]]
UK_auc_RF
## [1] 0.9617662
text(x = 0.5,
y = 0.5,
labels = paste0("AUC = ",
round(UK_auc_RF,
2)))
kable(UK_initial_RF$confusion)
| 0 |
416 |
31 |
0.0693512 |
| 1 |
71 |
382 |
0.1567329 |
UK_RF_acc = sum(UK_initial_RF$confusion[row(UK_initial_RF$confusion) == col(UK_initial_RF$confusion)]) / sum(UK_initial_RF$confusion)
UK_RF_acc
## [1] 0.886444
confusionMatrix(as.factor(UK_initial_RF$predicted), as.factor(UK_train$treatment), positive = "1", dnn=c("Prediction", "Actual"), mode = "everything")
## Confusion Matrix and Statistics
##
## Actual
## Prediction 0 1
## 0 416 71
## 1 31 382
##
## Accuracy : 0.8867
## 95% CI : (0.8641, 0.9066)
## No Information Rate : 0.5033
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7735
##
## Mcnemar's Test P-Value : 0.0001127
##
## Sensitivity : 0.8433
## Specificity : 0.9306
## Pos Pred Value : 0.9249
## Neg Pred Value : 0.8542
## Precision : 0.9249
## Recall : 0.8433
## F1 : 0.8822
## Prevalence : 0.5033
## Detection Rate : 0.4244
## Detection Prevalence : 0.4589
## Balanced Accuracy : 0.8870
##
## 'Positive' Class : 1
##
UK_double_original<-as.double(UK_rf_input[,4])
UK_double_predictions<-as.double(UK_initial_RF$predicted)
UK_residuals<-as.data.frame(UK_double_predictions-UK_double_original)
## Warning in UK_double_predictions - UK_double_original: longer object length is
## not a multiple of shorter object length
sum(UK_double_predictions-UK_double_original)/1000
## Warning in UK_double_predictions - UK_double_original: longer object length is
## not a multiple of shorter object length
## [1] -0.044
kable(as.data.frame(importance(UK_initial_RF, type = 2, scale = TRUE)))
| Gender |
1.109131 |
| self_employed |
1.639575 |
| family_history |
2.943197 |
| remote_work |
1.126799 |
| tech_company |
1.114462 |
| benefits |
2.429243 |
| care_options |
2.612736 |
| wellness_program |
1.276998 |
| seek_help |
1.679600 |
| anonymity |
2.064385 |
| leave |
2.348194 |
| mental_health_consequence |
2.337719 |
| phys_health_consequence |
1.229083 |
| coworkers |
2.983042 |
| supervisor |
2.011853 |
| mental_health_interview |
1.271313 |
| phys_health_interview |
2.293574 |
| mental_vs_physical |
2.921869 |
| obs_consequence |
1.483449 |
| Age |
3.357769 |
importance_colmn <- as.data.frame(UK_initial_RF$importance)
feat_imp_df <- importance(UK_initial_RF) %>%
data.frame() %>%
mutate(feature = row.names(.))
ggplot(feat_imp_df, aes(x = reorder(feature, MeanDecreaseGini),
y = MeanDecreaseGini)) +
geom_bar(stat='identity') +
coord_flip() +
theme_classic() +
labs(
x = "Feature",
y = "Importance",
title = "Mean Decrease Gini Plot U.K."
)

UK_RF_error = data.frame(1:nrow(UK_initial_RF$err.rate),
UK_initial_RF$err.rate)
colnames(UK_RF_error) = c("Number of Trees", "OutBoxerror",
"No", "Yes")
UK_RF_error$Diff <- UK_RF_error$Yes-UK_RF_error$No
fig_UK <- plot_ly(x = UK_RF_error$`Number of Trees`, y=UK_RF_error$Diff,name="Diff", type = 'scatter', mode = 'lines')
fig_UK <- fig_UK %>% add_trace(y=UK_RF_error$OutBoxerror, name="O.O.B error")
fig_UK <- fig_UK %>% add_trace(y=UK_RF_error$No, name="No")
fig_UK <- fig_UK %>% add_trace(y=UK_RF_error$Yes, name="Yes")
fig_UK
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
kable(UK_initial_RF$confusion)
| 0 |
416 |
31 |
0.0693512 |
| 1 |
71 |
382 |
0.1567329 |
UK_RF_acc = sum(UK_initial_RF$confusion[row(UK_initial_RF$confusion) == col(UK_initial_RF$confusion)]) / sum(UK_initial_RF$confusion)
UK_RF_acc
## [1] 0.886444
kable(head(UK_RF_error[order(UK_RF_error$OutBoxerror, decreasing = FALSE),], n=5))
| 495 |
495 |
0.1122222 |
0.0671141 |
0.1567329 |
0.0896188 |
| 496 |
496 |
0.1122222 |
0.0671141 |
0.1567329 |
0.0896188 |
| 497 |
497 |
0.1122222 |
0.0671141 |
0.1567329 |
0.0896188 |
| 498 |
498 |
0.1122222 |
0.0671141 |
0.1567329 |
0.0896188 |
| 499 |
499 |
0.1122222 |
0.0671141 |
0.1567329 |
0.0896188 |
sample_rows = 1:nrow(US_rf_input)
set.seed(1984)
test_rows = sample(sample_rows,
dim(US_rf_input)[1]*.10,
replace = FALSE)
US_train = US_rf_input[-test_rows,]
US_test = US_rf_input[test_rows,]
set.seed(1950)
US_initial_RF = randomForest(treatment~.,
US_train,
ntree = 500,
mtry = 15,
replace = TRUE,
sampsize = 100,
nodesize = 5,
importance = TRUE,
proximity = FALSE,
norm.votes = TRUE,
do.trace = FALSE,
keep.forest = TRUE,
keep.inbag = TRUE)
kable(US_initial_RF$confusion)
| 0 |
266 |
145 |
0.3527981 |
| 1 |
70 |
419 |
0.1431493 |
US_RF_acc = sum(US_initial_RF$confusion[row(US_initial_RF$confusion) == col(US_initial_RF$confusion)]) / sum(US_initial_RF$confusion)
US_RF_acc
## [1] 0.7606919
US_RF_2_prediction = as.data.frame(as.numeric(as.character(US_initial_RF$votes[,2])))
US_train_actual = data.frame(US_train[,4])
US_prediction_comparison = prediction(US_RF_2_prediction, US_train_actual)
US_pred_performance = performance(US_prediction_comparison,
measure = "tpr",
x.measure = "fpr")
US_rates = data.frame(fp = US_prediction_comparison@fp,
tp = US_prediction_comparison@tp,
tn = US_prediction_comparison@tn,
fn = US_prediction_comparison@fn)
colnames(US_rates) = c("fp", "tp", "tn", "fn")
tpr = US_rates$tp / (US_rates$tp + US_rates$fn)
fpr = US_rates$fp / (US_rates$fp + US_rates$tn)
US_rates_comparison = data.frame(US_pred_performance@x.values,
US_pred_performance@y.values,
fpr,
tpr)
colnames(US_rates_comparison) = c("x.values","y.values","fpr","tpr")
while (!is.null(dev.list())) dev.off()
plot(US_pred_performance,
col = "orange",
lwd = 3,
main = "ROC curve")
grid(col = "black")
abline(a = 0,
b = 1,
lwd = 2,
lty = 2,
col = "blue")
US_auc_RF = performance(US_prediction_comparison,
"auc")@y.values[[1]]
US_auc_RF
## [1] 0.8515118
text(x = 0.5,
y = 0.5,
labels = paste0("AUC = ",
round(US_auc_RF,
2)))
US_double_original<-as.double(US_rf_input[,4])
US_double_predictions<-as.double(US_initial_RF$predicted)
US_residuals<-as.data.frame(ifelse(US_double_predictions-US_double_original==0,0,1))
## Warning in US_double_predictions - US_double_original: longer object length is
## not a multiple of shorter object length
1-sum(US_residuals)/1000
## [1] 0.486
sum(US_residuals==0)
## [1] 486
UK_double_original<-as.double(UK_rf_input[,4])
UK_double_predictions<-as.double(UK_initial_RF$predicted)
UK_residuals<-as.data.frame(ifelse(UK_double_predictions-UK_double_original==0,0,1))
## Warning in UK_double_predictions - UK_double_original: longer object length is
## not a multiple of shorter object length
1-sum(UK_residuals)/1000
## [1] 0.532
sum(UK_residuals==0)
## [1] 532
kable(as.data.frame(importance(US_initial_RF, type = 2, scale = TRUE)))
| Gender |
0.8894703 |
| self_employed |
0.5259106 |
| family_history |
6.9679091 |
| remote_work |
0.9321551 |
| tech_company |
0.8632699 |
| benefits |
1.7220652 |
| care_options |
5.2530838 |
| wellness_program |
1.3839687 |
| seek_help |
1.7950005 |
| anonymity |
1.4661827 |
| leave |
2.4058954 |
| mental_health_consequence |
2.1110134 |
| phys_health_consequence |
1.2674832 |
| coworkers |
1.5316040 |
| supervisor |
1.8755152 |
| mental_health_interview |
0.9744162 |
| phys_health_interview |
2.0551686 |
| mental_vs_physical |
1.6707267 |
| obs_consequence |
0.8177320 |
| Age |
3.2120433 |
importance_colmn <- as.data.frame(US_initial_RF$importance)
feat_imp_df <- importance(US_initial_RF) %>%
data.frame() %>%
mutate(feature = row.names(.))
ggplot(feat_imp_df, aes(x = reorder(feature, MeanDecreaseGini),
y = MeanDecreaseGini)) +
geom_bar(stat='identity') +
coord_flip() +
theme_classic() +
labs(
x = "Feature",
y = "Importance",
title = "Mean Decrease Gini Plot U.S."
)

US_RF_error = data.frame(1:nrow(US_initial_RF$err.rate),
US_initial_RF$err.rate)
colnames(US_RF_error) = c("Number of Trees", "OutBoxerror",
"No", "Yes")
US_RF_error$Diff <- US_RF_error$Yes-US_RF_error$No
fig_US <- plot_ly(x = US_RF_error$`Number of Trees`, y=US_RF_error$Diff,name="Diff", type = 'scatter', mode = 'lines')
fig_US <- fig_US %>% add_trace(y=US_RF_error$OutBoxerror, name="O.O.B error")
fig_US <- fig_US %>% add_trace(y=US_RF_error$No, name="No")
fig_US <- fig_US %>% add_trace(y=US_RF_error$Yes, name="Yes")
fig_US
kable(US_initial_RF$confusion)
| 0 |
266 |
145 |
0.3527981 |
| 1 |
70 |
419 |
0.1431493 |
US_RF_acc = sum(US_initial_RF$confusion[row(US_initial_RF$confusion) == col(US_initial_RF$confusion)]) / sum(US_initial_RF$confusion)
US_RF_acc
## [1] 0.7606919
confusionMatrix(as.factor(US_initial_RF$predicted), as.factor(US_train$treatment), positive = "1", dnn=c("Prediction", "Actual"), mode = "everything")
## Confusion Matrix and Statistics
##
## Actual
## Prediction 0 1
## 0 266 70
## 1 145 419
##
## Accuracy : 0.7611
## 95% CI : (0.7319, 0.7886)
## No Information Rate : 0.5433
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5115
##
## Mcnemar's Test P-Value : 4.494e-07
##
## Sensitivity : 0.8569
## Specificity : 0.6472
## Pos Pred Value : 0.7429
## Neg Pred Value : 0.7917
## Precision : 0.7429
## Recall : 0.8569
## F1 : 0.7958
## Prevalence : 0.5433
## Detection Rate : 0.4656
## Detection Prevalence : 0.6267
## Balanced Accuracy : 0.7520
##
## 'Positive' Class : 1
##
kable(head(US_RF_error[order(US_RF_error$OutBoxerror, decreasing = FALSE),], n=5))
| 46 |
46 |
0.2177778 |
0.2895377 |
0.1574642 |
-0.1320735 |
| 47 |
47 |
0.2188889 |
0.2895377 |
0.1595092 |
-0.1300285 |
| 48 |
48 |
0.2188889 |
0.2919708 |
0.1574642 |
-0.1345066 |
| 51 |
51 |
0.2211111 |
0.2944039 |
0.1595092 |
-0.1348947 |
| 52 |
52 |
0.2211111 |
0.2944039 |
0.1595092 |
-0.1348947 |
Model Comparison
prop.test(x = c(506,514), n = c(1000, 1000),
alternative = "two.sided",correct=FALSE)
##
## 2-sample test for equality of proportions without continuity
## correction
##
## data: c(506, 514) out of c(1000, 1000)
## X-squared = 0.12805, df = 1, p-value = 0.7205
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.05181596 0.03581596
## sample estimates:
## prop 1 prop 2
## 0.506 0.514